03: Britain BirdTrack Analysis

Load in relevant packages and datasets:

Show code:
library(tidyverse)
library(plotly)
library(mgcv)
library(DT)
library(auk)

load("Data/BT_all.RData")
load("Data/BT_taxonomy.RData")
load("Data/BT_rrs.RData")

sessionInfo()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 22631)

Matrix products: default


locale:
[1] LC_COLLATE=English_United Kingdom.utf8 
[2] LC_CTYPE=English_United Kingdom.utf8   
[3] LC_MONETARY=English_United Kingdom.utf8
[4] LC_NUMERIC=C                           
[5] LC_TIME=English_United Kingdom.utf8    

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] auk_0.8.0       DT_0.33         mgcv_1.9-1      nlme_3.1-162   
 [5] plotly_4.10.4   lubridate_1.9.4 forcats_1.0.0   stringr_1.5.1  
 [9] dplyr_1.1.4     purrr_1.0.2     readr_2.1.5     tidyr_1.3.1    
[13] tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] generics_0.1.3    stringi_1.8.4     lattice_0.21-8    hms_1.1.3        
 [5] digest_0.6.37     magrittr_2.0.3    evaluate_1.0.3    grid_4.3.1       
 [9] timechange_0.3.0  fastmap_1.2.0     jsonlite_1.8.9    Matrix_1.6-1.1   
[13] httr_1.4.7        viridisLite_0.4.2 scales_1.3.0      lazyeval_0.2.2   
[17] cli_3.6.3         rlang_1.1.4       munsell_0.5.1     splines_4.3.1    
[21] withr_3.0.2       yaml_2.3.10       tools_4.3.1       tzdb_0.4.0       
[25] colorspace_2.1-1  vctrs_0.6.5       R6_2.5.1          lifecycle_1.0.4  
[29] htmlwidgets_1.6.4 pkgconfig_2.0.3   pillar_1.10.1     gtable_0.3.6     
[33] glue_1.8.0        data.table_1.16.4 xfun_0.50         tidyselect_1.2.1 
[37] rstudioapi_0.17.1 knitr_1.49        htmltools_0.5.8.1 rmarkdown_2.29   
[41] compiler_4.3.1   

Calculate the expected proportion of first-on-checklist observations for each species, then determine the ratio between the observed first-on-checklist proportion and the expected first-on-checklist proportion for each species, and plot that against reporting rate:

(confidence intervals estimated via non-parametric bootstrap)

Show code:
BT_all <- BT_all %>% 
  select(checklist_id, scientific_name, observer_id, 
         observation_date, observation_time, n_species)

BT_nlists <- BT_all %>% 
  pull(checklist_id) %>% 
  n_distinct()

BT_listids <- BT_all %>% 
  pull(checklist_id) %>% 
  unique()

rebuild_bootstrap <- FALSE

if (rebuild_bootstrap == TRUE | !file.exists("Data/BT_preference.RData")) {
  for (i in 1:1000) {
    BT_bs <- tibble(checklist_number = 1:BT_nlists) %>% 
      mutate(checklist_id = sample(BT_listids, BT_nlists, replace = TRUE)) %>% 
      left_join(., BT_all, by = "checklist_id", relationship = "many-to-many") %>% 
      group_by(checklist_id, checklist_number) %>% 
      mutate(first_observation = row_number() == 1) %>% 
      group_by(scientific_name) %>% 
      mutate(n_first = sum(first_observation),
             n_expected = sum(1 / n_species)) %>% 
      ungroup() %>% 
      mutate(BT_preference = n_first / n_expected) %>% 
      select(scientific_name, BT_preference) %>% 
      distinct() %>% 
      left_join(tibble(scientific_name = BT_rrs$scientific_name), .,
                by = "scientific_name") %>% 
      mutate(bs_i = i)
    
    if (i == 1){
      BT_bs_all = BT_bs
    } else {
      BT_bs_all = bind_rows(BT_bs_all, BT_bs)
    }
  }
  
  BT_preference <- BT_bs_all %>% 
    group_by(scientific_name) %>% 
    summarise(BT_preference_est = median(BT_preference),
              BT_preference_lci = if_else(
                sum(is.na(BT_preference)) == 0,
                quantile(BT_preference, 0.025, na.rm = TRUE), NA),
              BT_preference_uci = if_else(
                sum(is.na(BT_preference)) == 0,
                quantile(BT_preference, 0.975, na.rm = TRUE), NA)) %>% 
    rowwise() %>% 
    mutate(
      CI_calculable = as.logical(min(c(
        !is.na(BT_preference_lci), !is.na(BT_preference_uci),
        BT_preference_lci != BT_preference_uci), na.rm = TRUE)),
      EST_calculable = as.logical(min(c(
        BT_preference_est < Inf, BT_preference_est > 0, 
        !is.na(BT_preference_est)), na.rm = TRUE)))
  
  save(BT_preference, file = "Data/BT_preference.RData")
  
  rm(BT_bs, BT_bs_all)
  
} else{
  load("Data/BT_preference.RData")
}

plot1_data_preference <- left_join(BT_preference, BT_rrs, by = "scientific_name") %>% 
  filter(CI_calculable == TRUE, EST_calculable ==  TRUE) 

plot1_gam_model <- gam(data = plot1_data_preference,
                       formula = log10(BT_preference_est) ~ 
                         s(log10(reporting_rate), bs = "ds", k = 4),
                       gamma = 1.4)

plot1_data_gam <- tibble(reporting_rate = plot1_data_preference %>% 
                           pull(reporting_rate) %>% 
                           min() %>% 
                           log10() %>% 
                           seq(from = ., to = 0, by = 0.1) %>% 
                           `^`(10, .)) 
plot1_data_gam <- bind_cols(plot1_data_gam, predict(plot1_gam_model, plot1_data_gam, se.fit = TRUE)) %>% 
  mutate(BT_preference_est = 10 ^ fit,
         BT_preference_lci = 10 ^ (fit + qnorm(0.025) * se.fit),
         BT_preference_uci = 10 ^ (fit + qnorm(0.975) * se.fit))

plot1 <- ggplot() +
  geom_ribbon(data = plot1_data_gam, alpha = 0.2,
              mapping = aes(x = reporting_rate, 
                            ymin = BT_preference_lci, ymax = BT_preference_uci)) +
  geom_line(data = plot1_data_gam, size = 1, linetype = "solid",
            mapping = aes(x = reporting_rate, y = BT_preference_est)) +
  geom_linerange(data = plot1_data_preference %>% filter(BT_preference_lci == 0),
                 linetype = "dotted", alpha = 0.2,
                 mapping = aes(x = reporting_rate, 
                               ymin = BT_preference_lci, ymax = BT_preference_est)) +
  geom_linerange(data = plot1_data_preference %>% filter(BT_preference_lci == 0),
                 linetype = "solid", alpha = 0.2,
                 mapping = aes(x = reporting_rate,
                               ymin = BT_preference_est, ymax = BT_preference_uci)) +
  geom_linerange(data = plot1_data_preference %>% filter(BT_preference_lci > 0),
                 linetype = "solid", alpha = 0.2,
                 mapping = aes(x = reporting_rate,
                               ymin = BT_preference_lci, ymax = BT_preference_uci)) +
  geom_point(data = plot1_data_preference,
             mapping = aes(x = reporting_rate, y = BT_preference_est, label = scientific_name)) +
  geom_hline(yintercept = 1, linetype = "dashed") +
  scale_x_log10(labels = scales::percent) +
  scale_y_log10(breaks = c(1/10, 1/3, 1, 3, 10, 30), 
                labels = c("1:10", "1:3", "1:1", "3:1", "10:1", "30:1")) + 
  expand_limits(y = c(1/10)) +
  labs(x = "Species Reporting Rate", 
       y = "BirdTrack Preference Ratio") +
  theme_classic()

ggplotly(plot1)
Show code:
ggsave(plot = plot1, filename = "BT_preference.png", width = 9, height = 6)

Table of species information:

Show code:
left_join(BT_preference, BT_rrs, by = "scientific_name") %>% 
  mutate(across(is.numeric, round, digits = 4)) %>% 
  select(scientific_name, reporting_rate, BT_preference_est,
         BT_preference_lci, BT_preference_uci, 
         EST_calculable, CI_calculable) %>% 
  rename("Scientific Name" = scientific_name, 
         "Reporting Rate" = reporting_rate,
         "BirdTrack Preference Ratio" = BT_preference_est,
         "LCI" = BT_preference_lci,
         "UCI" = BT_preference_uci,
         "Proper CI?" = CI_calculable,
         "Proper Estimate?" = EST_calculable) %>% 
  datatable(filter = "top")

Explore the effect of first-of-year observations on BirdTrack preference ratio:

Show code:
BT_yeareffect <- BT_all %>%
  group_by(checklist_id) %>% 
  mutate(first_observation = row_number() == 1) %>% 
  ungroup() %>% 
  arrange(observation_date, observation_time) %>% 
  group_by(observer_id, scientific_name) %>% 
  mutate(first_of_year = row_number() == 1) %>% 
  group_by(scientific_name, first_of_year) %>% 
  mutate(n_obs = n(), n_first = sum(first_observation),
         n_expected = sum(1 / n_species)) %>% 
  ungroup() %>% 
  mutate(first_prop = n_first / n_obs,
         expected_prop = n_expected / n_obs,
         prop_ratio = first_prop / expected_prop) %>% 
  select(scientific_name, first_of_year, BT_preference = prop_ratio) %>% 
  distinct() %>% 
  pivot_wider(names_from = first_of_year, names_prefix = "first_",
              values_from = BT_preference) %>% 
  left_join(BT_rrs, by = "scientific_name") %>% 
  filter(reporting_rate > 0.01)

plot2 <- ggplot(BT_yeareffect) +
  geom_point(aes(x = first_TRUE, y = first_FALSE, label = scientific_name)) +
  geom_abline(linetype = "dotted") +
  coord_equal() +
  theme_classic() +
  labs(x = "BirdTrack Preference Ratio: First of Year Observations", 
       y = "BirdTrack Preference Ratio: NOT First of Year Observations") +
  scale_x_log10(breaks = c(1/3, 1, 3), 
                labels = c("1:3", "1:1", "3:1"),
                limits = c(1/3, 3)) + 
  scale_y_log10(breaks = c(1/3, 1, 3), 
                labels = c("1:3", "1:1", "3:1"),
                limits = c(1/3, 3))

ggplotly(plot2)
Show code:
ggsave(plot = plot2, filename = "YearOrderPlot.png", height = 8, width = 9)
Show code:
cor.test(formula = ~ log(first_TRUE) + log(first_FALSE), data = BT_yeareffect)

    Pearson's product-moment correlation

data:  log(first_TRUE) and log(first_FALSE)
t = 10.62, df = 147, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5571273 0.7411712
sample estimates:
      cor 
0.6588991